library(tidyverse)
library(philentropy)

# done once to split up the rdata
# load("/net/waterston/vol6/files_for_lou/data_files_for_lou.RData")
# saveRDS(worm_TF_targets,file="/net/waterston/vol6/files_for_lou/worm_TF_targets.rds")
# saveRDS(exp_matrix_mean,file="/net/waterston/vol6/files_for_lou/exp_matrix_emb_mean.rds")


# Life-stage labels belonging to each analysis group. The group names drive the
# loops below and are pasted into the expression-matrix and output file names.
stageGroups <- list(
  emb = as.list(c("earlyembryonic", "mixedstage", "midembryonic", "lateembryonic")),
  L2 = as.list(c("L1larva", "L2larva", "L3larva", "dauer")),
  YA = as.list(c("youngadult", "L4larva", "L4youngadult"))
)
# "lineage" uses exactly the same life stages as "emb"; only the group name
# (and therefore the files that are read and written) differs.
stageGroups[["lineage"]] <- stageGroups[["emb"]]

# Thresholds applied to the TF -> target table before any per-stage work.
minPeakCount <- 2   # smallest peak-cluster size (nPeaks) to keep
maxPeakCount <- 84  # largest peak-cluster size (nPeaks) to keep
minRank <- 0.3      # lowest peak rank to keep

# Load the target table, then in order:
#   1. keep peaks whose cluster size lies in [minPeakCount, maxPeakCount]
#   2. keep peaks with rank >= minRank
#   3. pull the life stage out of the peak id (third "_"-separated field)
#   4. keep a single row per TF/Gene pair
# NOTE(review): step 4 runs before the per-stage filtering done later in the
# script, so a TF/Gene pair whose first row belongs to one stage group is
# absent from every other group -- confirm this is intended.
worm_TF_targets <- readRDS(file = "/net/waterston/vol6/files_for_lou/worm_TF_targets.rds") %>%
  filter(nPeaks >= minPeakCount, nPeaks <= maxPeakCount) %>%
  filter(rank >= minRank) %>%
  mutate(lifeStage = str_split_i(peakID, "_", 3)) %>%
  distinct(TF, Gene, .keep_all = TRUE)

#group <- "emb"
# Gene names to rewrite in the expression matrices (old name = new name),
# presumably so they match the names used in worm_TF_targets -- confirm.
geneRenames <- c(
  "dxbp-1" = "dox-1",
  "svh-5" = "tag-97",
  "hinf-1" = "F39B2.1",
  "klu-1" = "ZK337.2",
  "B0035.1" = "znf-207"
)

# For every stage group: for each TF, compare the TF's expression profile
# across cell types with the profile of each of its targets (angleCosine), and
# do the same for an equally sized random set of genes. Three tables are
# written per group: target angles, random angles, and both stacked with a
# Source column ("Target" / "Random").
# NOTE(review): angleCosine() is not defined in this file and must already be
# available in the session -- confirm where it comes from.
for (group in names(stageGroups)) {
  # print(group)
  label <- str_c(group, as.integer(10 * minRank), sep = "_")

  # keep only the targets whose peak comes from a life stage of this group
  stageTargets <- filter(worm_TF_targets, lifeStage %in% stageGroups[[group]])

  # Nothing to do when no TF survives the filters. Without this guard the TF
  # loop would run over c(1, 0) and fail on a non-existent row.
  tfs <- distinct(stageTargets, TF)
  if (nrow(tfs) == 0) {
    warning("No TF targets for stage group '", group, "'; no angle tables written",
            call. = FALSE)
    next
  }

  # read the expression matrix to use for the group of stages - genes by cell types
  allExpr <- readRDS(file = str_c("/net/waterston/vol6/files_for_lou/exp_matrix_", group, "_mean.rds", sep = ""))
  genes <- rownames(allExpr)

  # form a dataframe with expression values for all genes - fix some gene names
  allExpr <- as.data.frame(allExpr)
  allExpr <- mutate(allExpr, gene = genes, .before = 1)
  for (oldName in names(geneRenames)) {
    allExpr$gene[allExpr$gene == oldName] <- geneRenames[[oldName]]
  }

  # expression of the TFs themselves; a TF missing from the matrix gets NA
  # values and drops out later through the is.na() filter on the angles
  alltfExpr <- left_join(tfs, allExpr, by = c("TF" = "gene"))

  targetAngleList <- vector("list", nrow(tfs))
  randomAngleList <- vector("list", nrow(tfs))
  for (i in seq_len(nrow(tfs))) {
    # plain character scalar, whether tfs is a data.frame or a tibble and
    # whether TF is stored as character or factor
    tf <- as.character(tfs$TF[i])
    # print(c(i, tf))

    # TPM values of the TF, one row per cell type
    tfExpr <- filter(alltfExpr, TF == tf)
    tfExpr <- pivot_longer(tfExpr, cols = 2:ncol(tfExpr), names_to = "cellType", values_to = "TF_TPM")

    # TPM values of the targets; targets with missing expression are dropped
    targets <- filter(stageTargets, TF == tf)
    targetExpr <- left_join(select(targets, "TF", "Gene"), allExpr, by = c("Gene" = "gene"))
    targetExpr <- na.omit(targetExpr)
    nTargets <- nrow(targetExpr)
    targetExpr <- pivot_longer(targetExpr, cols = 3:ncol(targetExpr), names_to = "cellType", values_to = "targetTPM")

    # one value per target gene: target profile vs. TF profile
    joined <- left_join(targetExpr, tfExpr, by = c("TF", "cellType"))
    angles <- group_by(joined, Gene) %>%
      summarize(CosineAngle = angleCosine(targetTPM, TF_TPM)) %>%
      mutate(TF = tf)
    # re-attach the peak annotation of each target
    targetAngleList[[i]] <- left_join(angles, targets, by = c("Gene", "TF"))

    # do the same for random targets: as many genes as there are usable targets
    # NOTE(review): the sampling is unseeded, so the random table differs
    # between runs.
    randomExpr <- sample_n(allExpr, nTargets) %>% dplyr::rename(Gene = gene)
    randomExpr <- pivot_longer(randomExpr, cols = 2:ncol(randomExpr), names_to = "cellType", values_to = "targetTPM")
    joined <- left_join(randomExpr, tfExpr, by = c("cellType"))
    randomAngleList[[i]] <- group_by(joined, Gene) %>%
      summarize(CosineRandomAngle = angleCosine(targetTPM, TF_TPM)) %>%
      mutate(TF = tf)
  }

  targetAngles <- bind_rows(targetAngleList) %>% filter(!is.na(CosineAngle))
  randomAngles <- bind_rows(randomAngleList) %>% filter(!is.na(CosineRandomAngle))
  write_tsv(targetAngles, file = str_c("/net/waterston/vol6/files_for_lou/TF_TargetAngles_2_84_", label, ".tsv", sep = ""))
  write_tsv(randomAngles, file = str_c("/net/waterston/vol6/files_for_lou/TF_RandomAngles_2_84_", label, ".tsv", sep = ""))

  # stack both tables under a common Angle column, tagged by Source
  targetAngles <- dplyr::rename(targetAngles, Angle = CosineAngle) %>% mutate(Source = "Target")
  randomAngles <- dplyr::rename(randomAngles, Angle = CosineRandomAngle) %>% mutate(Source = "Random")
  combinedAngles <- bind_rows(targetAngles, randomAngles)
  write_tsv(combinedAngles, file = str_c("/net/waterston/vol9/ChipSeqPipeline/TF_Angle_2_84_", label, ".tsv", sep = ""))
}

# For every stage group, read the stacked Target/Random angle table and, per
# TF, compare the two angle distributions with:
#   - the Jensen-Shannon distance (square root of the divergence) between
#     histograms with 0.1-wide bins on [-1, 1]
#   - a t-test and a Wilcoxon test of Angle by Source
#   - a chi-square test on quantile bins of the pooled angles
# TFs with 2 or fewer target or random angles are skipped. One summary row per
# TF is saved as an .rds file per group.
for (group in names(stageGroups)) {
  # print(group)
  label <- str_c(group, as.integer(10 * minRank), sep = "_")
  a <- read_tsv(file = str_c("/net/waterston/vol9/ChipSeqPipeline/TF_Angle_2_84_", label, ".tsv", sep = ""))
  tfs <- distinct(a, TF)

  histBreaks <- seq(-1, 1, .1)
  # 15 equally spaced probabilities -> 14 bins that cover the full range.
  # (A 0.07 step stops at 0.98, which leaves the top 2% of angles outside
  # every bin, so they are silently dropped from the chi-square table.)
  binProbs <- seq(0, 1, length.out = 15)

  tlist <- vector("list", nrow(tfs))
  for (i in seq_len(nrow(tfs))) {
    tf <- as.character(tfs$TF[[i]])
    # print(c(i, tf))
    df <- filter(a, TF == tf)
    targetAngles <- filter(df, Source == "Target")
    randomAngles <- filter(df, Source == "Random")
    if (nrow(targetAngles) > 2 && nrow(randomAngles) > 2) {

      # do jsd of target vs random angles on normalised histogram counts
      targetHist <- hist(targetAngles$Angle, breaks = histBreaks, plot = FALSE)
      targetDist <- targetHist$counts / sum(targetHist$counts)
      randomHist <- hist(randomAngles$Angle, breaks = histBreaks, plot = FALSE)
      randomDist <- randomHist$counts / sum(randomHist$counts)
      js <- jensen_shannon(targetDist, randomDist, testNA = FALSE, unit = "log")

      tRes <- t.test(Angle ~ Source, data = df)
      wRes <- wilcox.test(Angle ~ Source, data = df)

      # Chi-square on quantile bins of the pooled angles. Tied quantiles are
      # collapsed because cut() rejects duplicated breaks; with fewer than two
      # bins left the test is not meaningful and chiP is reported as NA.
      binBreaks <- unique(quantile(df$Angle, probs = binProbs))
      if (length(binBreaks) > 2) {
        df$outcome_bin <- cut(df$Angle, breaks = binBreaks, include.lowest = TRUE)
        chiP <- chisq.test(table(df$outcome_bin, df$Source))$p.value
      } else {
        chiP <- NA_real_
      }

      # t.test orders the group means by sorted Source level:
      # estimate[1] is "Random", estimate[2] is "Target".
      tlist[[i]] <- tibble(
        TF = tf,
        nTargets = nrow(targetAngles),
        JSD = sqrt(js),
        wilcoxP = wRes$p.value,
        chiP = chiP,
        t_test_p = tRes$p.value,
        targetMean = unname(tRes$estimate[2]),
        randomMean = unname(tRes$estimate[1])
      )
    }
  }
  ttest <- bind_rows(tlist)
  saveRDS(ttest, str_c("/net/waterston/vol9/ChipSeqPipeline/Worm_", label, "_JSD_ChiSquare_ttest.rds", sep = ""))
}


# Exploratory plots, meant to be run interactively after the loops above.
# `a` is the stacked Target/Random table of the last stage group processed.
# NOTE(review): `jsd` is not created anywhere in this script; it is presumably
# a per-TF table with TF and Angle_jsd columns left in the session -- confirm
# or load it before running this section.

# distribution of the per-TF JSD values
ggplot(jsd, aes(x = Angle_jsd)) + geom_density()

# JSD against the number of angles available for each TF
counts <- dplyr::count(a, TF)
counts <- left_join(counts, jsd)
ggplot(counts, aes(x = n, y = Angle_jsd)) + geom_point()

# TF to inspect (alternative: "hlh-1")
tf <- "pha-4"
tfjsd <- filter(counts, TF == tf)

# Target vs. random angle densities for the chosen TF; the title shows the TF
# followed by the first-row values of columns 2 and 3 of its counts/jsd row.
tfDensityPlot <- ggplot(filter(a, TF == tf), aes(x = Angle, color = Source)) +
  geom_density() +
  labs(title = str_c(tf, tfjsd[[2]][1], tfjsd[[3]][1], sep = " "))
tfDensityPlot

# build the plot that was just drawn (the bare name `plot` would be base R's
# plot() function, not a ggplot object)
build <- ggplot_build(tfDensityPlot)


